In [13]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [14]:
df = pd.read_csv('https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv',encoding='iso8859_15')
df = df.dropna(axis=1)
X = df.drop('IsDepDelayed',axis=1)
y = df['IsDepDelayed']
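A quick look at what remains after dropping the columns that contain missing values, and at the target balance, is useful before modelling. The cell below is an illustrative sketch added here, not part of the original run:
In [ ]:
# Sketch: sanity-check the remaining feature frame and the target distribution
print(X.shape)
print(y.value_counts())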
In [15]:
df.dtypes
Out[15]:
In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Label-encodes a specified list of columns, or all columns if None.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns
        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in self.columns
        }
        return self

    def transform(self, data):
        """
        Uses the fitted encoders to transform a data frame.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])
        return output

    def inverse_transform(self, data):
        """
        Uses the fitted encoders to map encoded columns back to their original labels.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.inverse_transform(data[column])
        return output
encoder = EncodeCategorical(['UniqueCarrier','Origin','Dest','IsArrDelayed'])
y_encoder = EncodeCategorical(['IsDepDelayed'])
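To see what the transformer does, the next cell is a small illustrative sketch (added here; the toy frame is made up) that fits EncodeCategorical on two columns and round-trips the values:
In [ ]:
# Illustrative only: a tiny made-up frame to show fit / transform / inverse_transform
toy = pd.DataFrame({'UniqueCarrier': ['AA', 'US', 'AA'], 'Origin': ['SFO', 'ORD', 'ORD']})
toy_encoder = EncodeCategorical(['UniqueCarrier', 'Origin'])
codes = toy_encoder.fit(toy).transform(toy)    # string labels -> integer codes
print(codes)
print(toy_encoder.inverse_transform(codes))    # integer codes -> original labels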
In [17]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
cols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'Origin', 'Dest', 'IsDepDelayed']
# Note: X and y were built above from the full frame, so this subset only narrows df itself
df = df[cols]
# Use a distinct name for the model so it does not shadow the `xgb` module alias imported earlier
xgb_clf = XGBClassifier()
pipeline = Pipeline([
    ("label_encoder", encoder),
    ('Classifier', xgb_clf)
])
In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
In [19]:
pipeline.fit(X_train,y_train)
Out[19]:
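As a quick sanity check (a sketch added here, not in the original notebook, and assuming the listed columns survived the dropna above), the fitted encoder step should now turn the string columns into integer codes before they reach XGBoost:
In [ ]:
# Sketch: inspect what the first pipeline step hands to the classifier
encoded_head = pipeline.named_steps['label_encoder'].transform(X_train.head())
encoded_head[['UniqueCarrier', 'Origin', 'Dest', 'IsArrDelayed']]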
In [20]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
predictions = pipeline.predict(X_test)
le = LabelEncoder()
le.fit(y_test)
print('accuracy: %f' % accuracy_score(le.transform(y_test), le.transform(predictions)))
print('auc: %f' % roc_auc_score(le.transform(y_test), le.transform(predictions)))
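Computing roc_auc_score from hard class labels only scores a single operating point. The sketch below (added here; it assumes the positive class is labelled 'YES', as it is in this dataset) uses the predicted probabilities instead, which gives a threshold-free AUC:
In [ ]:
# Sketch: AUC from predicted probabilities rather than hard labels
proba = pipeline.predict_proba(X_test)
pos_idx = list(xgb_clf.classes_).index('YES')   # column holding P(IsDepDelayed == 'YES')
print('auc (probabilities): %f' % roc_auc_score((y_test == 'YES').astype(int), proba[:, pos_idx]))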
In [21]:
print(xgb_clf.feature_importances_)
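The bare importance array is hard to interpret without column names. The next cell is a sketch (added here, not in the original) that pairs each score with its training column:
In [ ]:
# Sketch: line up each importance score with its feature name
pd.Series(xgb_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)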
In [22]:
from xgboost import plot_importance
%matplotlib inline
plot_importance(xgb_clf)
Out[22]:
In [23]:
X
Out[23]: